package com.coos.util;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.UnknownHostException;

/**
 * 根据url获取网页内容方法
 * @author zheng
 *
 */
public class GetURLContent {
	
	public static void main(String[] args) {
		//String url = "http://www.baidu.com";
		//String bodyStr = GetURLContent.getBodyStr(url);
		//System.out.println(bodyStr);
	}
	
	/**
	 * 把body里面的内容变成一行，便于分析判断
	 * @param urlname
	 * @return
	 */
	public static String getBodyStr(String urlname) {
		if ("".equals(getBody(urlname))) {
			return "";
		}
		//java中替换\r\n必须用\\r\\n,双反斜杠才表示一个反斜杠
		return getBody(urlname).replaceAll("\\r\\n|\\n|\\r", "").replaceAll("\\s{2,}", " ");
	}

	/**
	 * 截取网页body里面的内容
	 * @param urlname
	 * @return
	 */
	public static String getBody(String urlname) {
		if ("".equals(getContent(urlname))) {
			return "";
		}
		String str = "";
		String body = "";
		try {
			str = getContent(urlname).split("<body")[1];
			str = str.split("body>")[0];
			body = "<body" + str + "body>";
		} catch (Exception e) {
			return "";
		}
		return body;
	}

	/**
	 * 根据url获取网页内容
	 * @param urlname
	 * @return
	 */
	public static String getContent(String urlname) {
		String message = "";
		try {
			URL url = new URL(urlname);
			HttpURLConnection connect = (HttpURLConnection) url
					.openConnection();
			connect.setRequestProperty("User-agent", "Mozilla/4.0");
			//网络不通时报错 java.net.UnknownHostException: www.coomix.net
			connect.connect();
			//域名不存在时报错java.net.SocketException: Unexpected end of file from server
			InputStream is = connect.getInputStream();

			//网页的读取不到getContentLength 图片的才能取getContentLength
//			int filesize = connect.getContentLength();

			byte[] bf = new byte[1];
			StringBuffer content = new StringBuffer();
			while ((is.read(bf)) != -1) {
			//is.read()直接判断会删除掉一个字符
//			while (is.read() != -1) {
//			while (is.available() > 0) {
//			while (is.available() > 0 || is.read() != -1) {
//				System.out.println("into while!");
				content.append(new String(bf, "utf-8"));
				//先等待10毫秒 避免重复循环次数太多
				Thread.sleep(10);
				int all = is.available();
				byte[] b = new byte[all];
				is.read(b);
				content.append(new String(b, "utf-8"));
			}

			message = content.toString();
			
			is.close();
			url = null;

		}
		//使用了一个不合适的host名字，不能进行DNS解析host,网络断开情况下不能进行DNS解析
		catch(UnknownHostException hx){
			System.out.println("抓取网页出现UnknownHostException错误,返回UnknownHostException值");
			hx.printStackTrace();
			return "UnknownHostException";
		}
		catch (IOException ex) {
			System.out.println("抓取网页出现IOException错误,返回空值");
			ex.printStackTrace();
			return "";
		} catch (Exception e) {
			System.out.println("抓取网页出现Exception错误");
			e.printStackTrace();
			return "";
		}
		return message;
	}
}
